# Twitter sentiment analysis.
import sys
# NOTE(review): hard-coded per-machine site-packages path — remove or make
# configurable before sharing this notebook.
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px
# Use plotly as the pandas plotting backend: df.plot.* returns plotly figures.
pd.options.plotting.backend = "plotly"
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',1000)
import time,os,json,sys
time_start_notebook = time.time()  # wall-clock start, for total runtime reporting
home = os.path.expanduser('~')
SEED=100  # global random seed
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])
#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict
#===== Warnings
import warnings
warnings.simplefilter("ignore")
# plotting warnings: raise the notebook auto-scroll threshold so long outputs stay visible
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import ast
# Load the combined (train + test) pre-cleaned tweet data.
df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')
# Variables (column names used throughout the notebook)
target = 'label'  # 0.0 = positive tweet, 1.0 = negative tweet (see df_pos/df_neg below)
maincol = 'tweet'
mc = maincol + '_clean'  # cleaned tweet text
mcl = maincol + '_lst_clean'  # cleaned tweet as a token list
mce = mc + '_emoji'  # cleaned text, emoji variant
mcle = mcl + '_emoji'  # token list, emoji variant
# we need to make list as list type
# (the CSV round-trip stringified the lists; literal_eval restores them)
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)
# Test rows carry no label, so split on label nullness.
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")
# Peek at first & last rows. NOTE(review): DataFrame.append is deprecated in newer pandas.
df_train.head(2).append(df_train.tail(2))
df = df_train
df_pos = df[df['label']==0.0] # it's 0 NOT 1
df_neg = df[df['label']==1.0]
sns.countplot(df[target])
df[target].value_counts().plot.bar()
# --- Quick unsupervised look: TF-IDF -> PCA -> k-means on a 1000-tweet sample ---
df1 = df.sample(1000)
# NOTE(review): tfidf is computed on the FULL df and then index-aligned down to
# the sample on assignment — presumably intentional (fit on the whole corpus),
# but df1[mce] would be cheaper if only the sample matters; confirm.
df1['tfidf'] = df[mce].pipe(hero.tfidf)
df1['pca'] = df1['tfidf'].pipe(hero.pca)
df1['kmeans_labels'] = df1['tfidf'].pipe(hero.kmeans,n_clusters=2)
hero.scatterplot(df1, 'pca', color='kmeans_labels')
df.head(2).T
df[mcle].head(2)[0]
# Flatten the per-tweet token lists into one big word list per group
# (summing lists concatenates them).
arr_all_words = df[mcle].sum()
arr_pos_words = df[df[target]==0.0][mcle].sum()
arr_neg_words = df[df[target]==1.0][mcle].sum()
print(f"len arr_all_words: {len(arr_all_words)}")
print(f"len arr_pos_words: {len(arr_pos_words)}")
print(f"len arr_neg_words: {len(arr_neg_words)}")
from collections import Counter
# Word-frequency tables (column 0 = word, column 1 = count), most common first.
df_freq = pd.DataFrame(Counter(arr_all_words).most_common())
df_freq_pos = pd.DataFrame(Counter(arr_pos_words).most_common())
df_freq_neg = pd.DataFrame(Counter(arr_neg_words).most_common())
df_freq_pos.head()
# Same counts via numpy (sorted by word rather than by count).
df_freq = pd.DataFrame(np.unique(arr_all_words,return_counts=True)).T
df_freq.head(2).append(df_freq.tail(2))
# Same counts again via nltk's FreqDist, which also offers a quick plot.
fdist = nltk.FreqDist(arr_all_words)
print([i for i in dir(fdist) if i[0]!='_'])
df_freq = pd.DataFrame(fdist.most_common(20))
df_freq.head()
fdist.plot(20)
# And once more via texthero, including a normalized (percentage) view.
df_freq = hero.top_words(df[mce]).to_frame()
df_freq.head(2)
hero.top_words(df[mce],normalize=True).to_frame().head().mul(100)
hero.wordcloud(df[mce])
# --- Matplotlib word clouds: positive vs negative tweets side by side ---
from wordcloud import WordCloud
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[30, 30])
wordcloud1 = WordCloud( background_color='white',
width=800,
height=600
).generate(' '.join(arr_pos_words))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Positive Tweets',fontsize=40);
wordcloud2 = WordCloud( background_color='white',
width=800,
height=600
).generate(' '.join(arr_neg_words))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Negative Tweets',fontsize=40);
# Interactive (plotly) word cloud for the positive tweets.
from plotly_wordcloud import plotly_wordcloud
text = " ".join(arr_pos_words)
fig = plotly_wordcloud(text)
fig['layout']['title'] = 'Wordcloud for +Ve Tweets'
fig['layout']['height'] = 800
fig['layout']['width'] = 800
py.iplot(fig)
import inspect
# inspect.getsourcelines(plotly_wordcloud)
# NOTE(review): df_pos_uni / df_neg_uni are defined much later in this file
# (built from get_top_n_words) — these treemap cells only work if those cells
# were executed first in the notebook; reorder when converting to a script.
fig = px.treemap(df_pos_uni.head(20),
path=['Word'],values='Count',
title='Top +Ve Twitter Words')
fig['layout']['title']['x'] = 0.5  # center the title
fig.show()
fig = px.treemap(df_neg_uni.head(20),path=['Word'],values='Count')
fig.update_layout(
title={
'text': "Top -Ve Twitter Words",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
df.columns
import warnings
warnings.simplefilter("ignore")
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Boolean mask over df_train: True for negative (label == 1) tweets.
NEG_TWEETS = df_train[target] == 1
def compare_distplots(df_train, features, test_df=None, neg_mask=None):
    """Plot per-feature distributions, one row of two panels per feature.

    Left panel: the feature's distribution for positive (mask False) vs
    negative (mask True) tweets within the training set.
    Right panel: the feature's distribution in the training vs test set.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data containing every column named in ``features``.
    features : list of str
        Numeric feature columns to plot.
    test_df : pandas.DataFrame, optional
        Test data; defaults to the module-level ``df_test``.
    neg_mask : pandas.Series of bool, optional
        Mask of negative tweets; defaults to the module-level ``NEG_TWEETS``.
    """
    # Backward-compatible defaults: fall back to the notebook-level globals
    # that the original version read implicitly.
    if test_df is None:
        test_df = df_test
    if neg_mask is None:
        neg_mask = NEG_TWEETS
    # squeeze=False guarantees a 2-D axes array even for a single feature
    # (without it, nrows=1 squeezes axes to 1-D and axes[i][j] raises).
    fig, axes = plt.subplots(ncols=2, nrows=len(features), figsize=(20, 50), dpi=100, squeeze=False)
    for i, feature in enumerate(features):
        sns.distplot(df_train.loc[~neg_mask][feature], label='Positive', ax=axes[i][0], color='green')
        sns.distplot(df_train.loc[neg_mask][feature], label='Negative', ax=axes[i][0], color='red')
        sns.distplot(df_train[feature], label='Training', ax=axes[i][1])
        sns.distplot(test_df[feature], label='Test', ax=axes[i][1])
        for j in range(2):
            axes[i][j].set_xlabel('')
            axes[i][j].tick_params(axis='x', labelsize=20)
            axes[i][j].tick_params(axis='y', labelsize=20)
            axes[i][j].legend(fontsize=20)
        axes[i][0].set_title(f'{feature} Target Distribution in Training Set', fontsize=20)
        axes[i][1].set_title(f'{feature} Training & Test Set Distribution', fontsize=20)
    plt.show()
# Compare engineered text-statistics features between classes and splits.
features = ['total_length', 'num_words', 'num_sent',
'num_unique_words', 'num_words_title', 'num_uppercase',
'num_exclamation_marks' ]
compare_distplots(df_train,features)
features = ['num_question_marks',
'num_punctuation','num_symbols', 'num_digits',
'avg_word_len', 'avg_uppercase']
compare_distplots(df_train,features)
note = """
NOTE:
The distriubtion must be different between label +ve and -ve
but must be similar between train and test sets.
""";
from wordcloud import STOPWORDS
def generate_ngrams(text, n_gram=1):
    """Tokenize *text* and return its n-grams as space-joined strings.

    Lowercases the text, splits on whitespace, drops stopwords (wordcloud's
    ``STOPWORDS``), then slides a window of ``n_gram`` tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.
    n_gram : int, default 1
        Size of the n-gram window (1 = unigrams, 2 = bigrams, ...).

    Returns
    -------
    list of str
        Each n-gram as a single space-joined string.
    """
    # str.split() (no argument) splits on ANY whitespace and never yields
    # empty tokens — the original split(' ') missed tabs/newlines and needed
    # an explicit filter for '' tokens.
    token = [t for t in text.lower().split() if t not in STOPWORDS]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]
def get_ngram_dfs(df_train,NEG_TWEETS,col,n_gram=1):
    """Build per-class n-gram frequency tables.

    Returns ``[df_neg_ngrams, df_pos_ngrams]``: each a DataFrame with the
    n-gram in column 0 and its count in column 1, most frequent first.
    """
    def _freq_table(tweets):
        # Accumulate counts for every n-gram across the given tweets.
        counts = defaultdict(int)
        for tweet in tweets:
            for gram in generate_ngrams(tweet, n_gram=n_gram):
                counts[gram] += 1
        # Sort ascending by count, then reverse — matches the original's
        # exact tie ordering.
        return pd.DataFrame(sorted(counts.items(), key=lambda kv: kv[1])[::-1])

    negative = _freq_table(df_train[NEG_TWEETS][col])
    positive = _freq_table(df_train[~NEG_TWEETS][col])
    return [negative, positive]
def plot_neg_pos_ngrams(n_gram_name,
                        df_neg_ngrams,df_pos_ngrams,N=20):
    """Draw side-by-side horizontal bar charts of the top-N n-grams.

    Left panel: negative tweets (red). Right panel: positive tweets (green).
    Each input DataFrame has the n-gram in column 0 and its count in column 1.
    """
    fontsize = 25
    fig, axes = plt.subplots(ncols=2, figsize=(18, 20), dpi=100)
    plt.tight_layout()
    panels = [(df_neg_ngrams, 'red', '-Ve'), (df_pos_ngrams, 'green', '+Ve')]
    for ax, (frame, color, sign) in zip(axes, panels):
        sns.barplot(y=frame[0].values[:N], x=frame[1].values[:N], ax=ax, color=color)
        ax.spines['right'].set_visible(False)
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis='x', labelsize=fontsize)
        ax.tick_params(axis='y', labelsize=fontsize)
        ax.set_title(f'Top {N} most common {n_gram_name} in {sign} Tweets', fontsize=fontsize)
    plt.show()
# Unigram, bigram and trigram frequency comparisons (negative vs positive).
df1, df2 = get_ngram_dfs(df_train, NEG_TWEETS, mce, n_gram=1)
plot_neg_pos_ngrams('Unigrams', df1, df2, N=20)  # was mislabeled 'Bigrams'
df1, df2 = get_ngram_dfs(df_train, NEG_TWEETS, mce, n_gram=2)
plot_neg_pos_ngrams('Bigrams', df1, df2, N=20)
df1, df2 = get_ngram_dfs(df_train, NEG_TWEETS, mce, n_gram=3)
plot_neg_pos_ngrams('Trigrams', df1, df2, N=20)
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
def get_top_n_words(corpus, n=None):
    """Return the *n* most frequent words in *corpus* as (word, count) pairs.

    Frequencies come from a CountVectorizer fitted with English stop words
    removed. Pass ``n=None`` to get every word, most frequent first.
    """
    vectorizer = CountVectorizer(stop_words = 'english').fit(corpus)
    # Total occurrences of each vocabulary term across the whole corpus.
    totals = vectorizer.transform(corpus).sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    # Stable descending sort by count (same key and stability as sorted()).
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
# df1/df2 here still hold the trigram tables from the last get_ngram_dfs call.
df1.head()
df1.head(20)[::-1].plot.bar(x=1,y=0)  # column 1 = count, column 0 = n-gram
df_pos = df[df[target]==0.0]
df_neg = df[df[target]==1.0]
df_pos.head(2)
# Top-20 unigrams per class via CountVectorizer (English stop words removed).
pos_uni = get_top_n_words(df_pos[mce],20)
neg_uni = get_top_n_words(df_neg[mce],20)
# Reverse so the most frequent word lands at the top of the bar chart.
df_pos_uni = pd.DataFrame(pos_uni,columns=['Word','Count'])[::-1]
df_neg_uni = pd.DataFrame(neg_uni,columns=['Word','Count'])[::-1]
fig = df_pos_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='green', opacity=0.6)
fig.update_layout(title_text='Most frequent +Ve **Unigrams** ')
fig.show()
fig = df_neg_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='red', opacity=0.6)
fig.update_layout(title_text='Most frequent -Ve **Unigrams** ')
fig.show()